# Install any required package that is not available yet.
# requireNamespace() only checks whether a package can be loaded; unlike
# require() it does not attach the package as a side effect. Attaching is
# left to the library() calls that follow.
required_packages <- c("DT", "ggplot2", "tidyverse", "hrbrthemes", "dplyr")
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
# Load stuff we need later
library(readr)
library(DT)
library(ggplot2)
library(tidyverse)
library(hrbrthemes)
library(dplyr)
# Switch to the project workspace so that the relative CSV path below resolves.
setwd("~/projects/bbs-for-independence/03_workspace")
# Load the dataset summary from CSV (column type messages are silenced).
dataset <- read_csv("../.tmp/dataset.csv", show_col_types = FALSE)
# Add the per-row difference between the two ratios (B minus A).
dataset[["charratioDelta"]] <- dataset[["charratioB"]] - dataset[["charratioA"]]
# Per-category mean (rounded to two digits) of length, length_raw,
# avgcolumnsize, charratioA, charratioB and charratioDelta.
# The columns are selected by name instead of by position, so the summary
# keeps its meaning even if the column order of the CSV changes.
summary_columns <- c(
  "length", "length_raw", "avgcolumnsize",
  "charratioA", "charratioB", "charratioDelta"
)
# Coerce via character to numeric; values that cannot be parsed become NA
# (the coercion warning is suppressed) and are ignored by the mean.
rounded_mean <- function(x) {
  values <- suppressWarnings(as.numeric(as.character(x)))
  round(mean(values, na.rm = TRUE), digits = 2)
}
groups <- aggregate(
  dataset[, summary_columns],
  by = list(dataset$category),
  FUN = rounded_mean
)
cat("Anzahl Dateien: ", nrow(dataset))
## Anzahl Dateien: 107364
cat("Anzahl Kategorien: ", nrow(groups))
## Anzahl Kategorien: 46
# Show the summary sorted by mean charratioB, highest first.
groups %>% arrange(desc(charratioB))
| Group.1 | length | length_raw | avgcolumnsize | charratioA | charratioB | charratioDelta |
|---|---|---|---|---|---|---|
| sex | 25245.31 | 25689.20 | 66.57 | 0.73 | 0.95 | 0.22 |
| digest | 34519.19 | 34542.65 | 58.01 | 0.74 | 0.94 | 0.20 |
| etext | 299102.23 | 301868.73 | 56.19 | 0.72 | 0.94 | 0.22 |
| law | 29771.61 | 30244.16 | 61.56 | 0.72 | 0.94 | 0.22 |
| politics | 28007.38 | 28347.29 | 61.24 | 0.69 | 0.94 | 0.25 |
| stories | 27076.01 | 27511.67 | 64.07 | 0.71 | 0.94 | 0.23 |
| news | 12971.45 | 13154.94 | 79.36 | 0.73 | 0.93 | 0.20 |
| occult | 27980.03 | 28410.65 | 61.51 | 0.70 | 0.92 | 0.22 |
| sf | 38351.08 | 38987.91 | 56.19 | 0.69 | 0.92 | 0.22 |
| survival | 16287.49 | 16547.33 | 63.37 | 0.68 | 0.92 | 0.25 |
| drugs | 17921.99 | 18082.32 | 58.79 | 0.69 | 0.91 | 0.22 |
| uploads | 8816.45 | 8845.72 | 95.82 | 0.68 | 0.91 | 0.22 |
| adventure | 12030.71 | 12210.70 | 66.74 | 0.66 | 0.90 | 0.24 |
| apple | 17961.37 | 18178.18 | 71.89 | 0.63 | 0.90 | 0.26 |
| conspiracy | 21382.73 | 21588.19 | 58.61 | 0.69 | 0.90 | 0.21 |
| food | 9869.62 | 10089.01 | 48.61 | 0.66 | 0.90 | 0.24 |
| fun | 25376.78 | 25846.89 | 58.72 | 0.67 | 0.90 | 0.24 |
| humor | 13400.88 | 13677.52 | 52.81 | 0.68 | 0.90 | 0.22 |
| rpg | 41113.61 | 41716.81 | 56.43 | 0.67 | 0.90 | 0.24 |
| anarchy | 14598.59 | 14846.09 | 62.45 | 0.63 | 0.89 | 0.25 |
| media | 38898.56 | 39541.82 | 52.57 | 0.65 | 0.89 | 0.24 |
| ufo | 12556.73 | 12795.85 | 60.54 | 0.67 | 0.89 | 0.22 |
| 100 | 28605.01 | 28974.88 | 61.36 | 0.65 | 0.88 | 0.24 |
| internet | 44197.64 | 44858.94 | 54.36 | 0.66 | 0.88 | 0.22 |
| games | 18859.54 | 19132.80 | 60.54 | 0.63 | 0.87 | 0.25 |
| groups | 13316.24 | 13483.25 | 125.72 | 0.60 | 0.87 | 0.27 |
| hacking | 28084.39 | 28523.15 | 62.64 | 0.62 | 0.87 | 0.25 |
| magazines | 26874.41 | 27104.82 | 120.86 | 0.63 | 0.86 | 0.23 |
| music | 24969.30 | 25322.22 | 52.15 | 0.60 | 0.86 | 0.26 |
| reports | 11470.83 | 11667.04 | 67.95 | 0.62 | 0.86 | 0.24 |
| virus | 14520.71 | 14823.81 | 58.25 | 0.58 | 0.86 | 0.28 |
| programming | 37479.33 | 38219.20 | 55.93 | 0.58 | 0.85 | 0.28 |
| computers | 22598.36 | 23046.15 | 56.02 | 0.58 | 0.84 | 0.26 |
| holiday | 5166.49 | 5275.23 | 140.58 | 0.62 | 0.84 | 0.22 |
| phreak | 15324.95 | 15617.78 | 60.04 | 0.57 | 0.84 | 0.26 |
| messages | 41279.98 | 42040.92 | 50.41 | 0.61 | 0.82 | 0.21 |
| hamradio | 14198.19 | 14469.32 | 55.36 | 0.51 | 0.81 | 0.31 |
| science | 18272.69 | 18503.92 | 58.03 | 0.58 | 0.81 | 0.23 |
| bbs | 25039.61 | 25456.31 | 62.75 | 0.53 | 0.80 | 0.27 |
| art | 27190.70 | 27231.47 | 86.18 | 0.23 | 0.74 | 0.51 |
| piracy | 9269.16 | 9417.98 | 85.37 | 0.35 | 0.73 | 0.38 |
| history | 18394.00 | 18422.78 | 55.16 | 0.49 | 0.61 | 0.12 |
| artifacts | 27249.19 | 27378.12 | 820.73 | 0.39 | 0.48 | 0.09 |
| exhibits | 57330.67 | 57331.80 | 62.64 | 0.31 | 0.38 | 0.07 |
| tap | 227088.44 | 227092.95 | 35.22 | 0.25 | 0.28 | 0.03 |
| floppies | 986584.93 | 986598.72 | 34.40 | 0.25 | 0.27 | 0.03 |
# create plot: charratioA
# Box plot of charratioA per category, categories ordered by their median.
# The individual observations are overlaid as faint points and a red cross
# marks the mean of each category.
dataset %>%
  ggplot(aes(
    x = reorder(category, charratioA, FUN = median),
    y = charratioA,
    group = category
  )) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  # Jitter only along the category axis. By default geom_jitter() also adds
  # noise along the value axis, which shifts the plotted ratios and can push
  # points at the 0/1 limits out of range (presumably the source of the
  # "Removed N rows" warning of geom_point).
  geom_jitter(height = 0, color = "black", size = 0.4, alpha = 0.05) +
  # `fun` replaces the deprecated `fun.y` argument.
  stat_summary(
    fun = mean, geom = "point",
    shape = 4, size = 2, color = "red", fill = "red"
  ) +
  coord_flip() +
  ylim(0, 1) +
  xlab("Kategorie") +
  ylab("Verhältnis")
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: Removed 91 rows containing missing values (geom_point).
# create plot: charratioB
# Box plot of charratioB per category, categories ordered by their median.
# The individual observations are overlaid as faint points and a red cross
# marks the mean of each category.
dataset %>%
  ggplot(aes(
    x = reorder(category, charratioB, FUN = median),
    y = charratioB,
    group = category
  )) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  # Jitter only along the category axis. By default geom_jitter() also adds
  # noise along the value axis, which shifts the plotted ratios and can push
  # points at the 0/1 limits out of range (presumably the source of the
  # "Removed N rows" warning of geom_point).
  geom_jitter(height = 0, color = "black", size = 0.4, alpha = 0.05) +
  # `fun` replaces the deprecated `fun.y` argument.
  stat_summary(
    fun = mean, geom = "point",
    shape = 4, size = 2, color = "red", fill = "red"
  ) +
  coord_flip() +
  ylim(0, 1) +
  xlab("Kategorie") +
  ylab("Verhältnis")
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: Removed 76 rows containing missing values (geom_point).
# create plot: charratioDelta (charratioB - charratioA)
# Box plot of the difference between the two ratios per category. The
# individual observations are overlaid as faint points and a red cross marks
# the mean of each category.
# The categories are ordered by the median of the same quantity that is
# plotted, as in the charratioA and charratioB plots. The previous sort key
# (charratioA - charratioB) had the opposite sign of the plotted value and
# therefore reversed the category order relative to those plots.
dataset %>%
  ggplot(aes(
    x = reorder(category, charratioDelta, FUN = median),
    y = charratioDelta,
    group = category
  )) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  # Jitter only along the category axis. By default geom_jitter() also adds
  # noise along the value axis, which shifts the plotted differences and can
  # push points at the 0/1 limits out of range (presumably the source of the
  # "Removed N rows" warning of geom_point).
  geom_jitter(height = 0, color = "black", size = 0.4, alpha = 0.05) +
  # `fun` replaces the deprecated `fun.y` argument.
  stat_summary(
    fun = mean, geom = "point",
    shape = 4, size = 2, color = "red", fill = "red"
  ) +
  coord_flip() +
  ylim(0, 1) +
  xlab("Kategorie") +
  ylab("Differenz beiden Verhältnissen")
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: Removed 249 rows containing missing values (geom_point).